-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Implement codegen for GFX11+ V_CVT_PK_[IU]16_F32 #168719
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) ChangesPatch is 50.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168719.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e37d739fc25df..0a9ae52d5dabf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6806,6 +6806,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return splitTernaryVectorOp(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ if (AMDGPU::isGFX11Plus(*Subtarget) && Op.getValueType() == MVT::i16 &&
+ Op.getOperand(0).getValueType() == MVT::f32) {
+ // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
+ return Op;
+ }
return LowerFP_TO_INT(Op, DAG);
case ISD::SHL:
case ISD::SRA:
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..936eb545563ae 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -469,6 +469,14 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
// SchedRW = [WriteSFPU], isReMaterializable = 1
+let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in {
+ // Fallback patterns for f32->i16 conversion.
+ def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)),
+ (S_CVT_I32_F32 $src0)>;
+ def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)),
+ (S_CVT_U32_F32 $src0)>;
+}
+
let hasSideEffects = 1 in {
let has_sdst = 0 in {
let Uses = [M0] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 872bde501cd2d..e1b22c6804544 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1721,6 +1721,28 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
+
+ def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+ def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+
+ // Fallback patterns for f32->i16 conversion. These are only required because
+ // f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above.
+ let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>;
+ def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>;
+ }
+ let True16Predicate = NotUseRealTrue16Insts in {
+ def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
+ def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
+ }
} // End SubtargetPredicate = isGFX11Plus
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 28d7e6916e519..c9b2c8c08b41c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -35513,55 +35513,24 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX1250TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1250FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i16>
ret <2 x i16> %op
}
@@ -35655,61 +35624,27 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v2
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1250FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v2
+; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i16>
ret <3 x i16> %op
}
@@ -35827,77 +35762,29 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v3
+; GFX11-NEXT: v_cvt_pk_i16_f32 v1, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1250FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX1250FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v3
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v1, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i16>
ret <4 x i16> %op
}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 0c5ed00b58d90..dea2dbe18a05d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s --check-prefixes=SI
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s --check-prefixes=VI
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG
declare float @llvm.fabs.f32(float) #1
@@ -28,6 +29,17 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
; EG-LABEL: fp_to_sint_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -67,6 +79,17 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_i32_fabs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f32_e64 v1, |s2|
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
; EG-LABEL: fp_to_sint_i32_fabs:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -108,6 +131,16 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, s3
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
; EG-LABEL: fp_to_sint_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
@@ -157,6 +190,20 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f3...
[truncated]
|
🐧 Linux x64 Test Results
|
broxigarchen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
| return splitTernaryVectorOp(Op, DAG); | ||
| case ISD::FP_TO_SINT: | ||
| case ISD::FP_TO_UINT: | ||
| if (AMDGPU::isGFX11Plus(*Subtarget) && Op.getValueType() == MVT::i16 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't use AMDGPU::isGFX11Plus, use the method directly on the subtarget. The standalone predicates are a hack for MC.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also better to not do this with a generation check
| Op.getOperand(0).getValueType() == MVT::f32) { | ||
| // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32. | ||
| return Op; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not sink this into LowerFP_TO_INT?
No description provided.